Final Project - Indicators of Anxiety or Depression Based on Reported Frequency of Symptoms During Last 7 Days

Author

Ian Walsh & Logan Rosell

Published

November 12, 2025

Research Question: How did anxiety and depression levels differ between states and regions following the outbreak of COVID-19 in the United States?

Data Cleaning

Import libraries and dataset

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
from sklearn.metrics import PredictionErrorDisplay
from scipy.stats import probplot
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Suppress pandas' SettingWithCopyWarning for the remainder of the session.
warnings.simplefilter('ignore', category=pd.errors.SettingWithCopyWarning)

# Location of the raw indicator CSV, relative to the working directory.
DATA_PATH = "./Datasets/Indicators_of_Anxiety_or_Depression_Based_on_Reported_Frequency_of_Symptoms_During_Last_7_Days.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows to check the columns were parsed as expected.
df.head()
Indicator Group State Subgroup Phase Time Period Time Period Label Time Period Start Date Time Period End Date Value Low CI High CI Confidence Interval Quartile Range
0 Symptoms of Depressive Disorder National Estimate United States United States 1 1 Apr 23 - May 5, 2020 04/23/2020 05/05/2020 23.5 22.7 24.3 22.7 - 24.3 NaN
1 Symptoms of Depressive Disorder By Age United States 18 - 29 years 1 1 Apr 23 - May 5, 2020 04/23/2020 05/05/2020 32.7 30.2 35.2 30.2 - 35.2 NaN
2 Symptoms of Depressive Disorder By Age United States 30 - 39 years 1 1 Apr 23 - May 5, 2020 04/23/2020 05/05/2020 25.7 24.1 27.3 24.1 - 27.3 NaN
3 Symptoms of Depressive Disorder By Age United States 40 - 49 years 1 1 Apr 23 - May 5, 2020 04/23/2020 05/05/2020 24.8 23.3 26.2 23.3 - 26.2 NaN
4 Symptoms of Depressive Disorder By Age United States 50 - 59 years 1 1 Apr 23 - May 5, 2020 04/23/2020 05/05/2020 23.2 21.5 25.0 21.5 - 25.0 NaN

Filter to only state-level data and drop unnecessary columns: Group and Subgroup are redundant, and Time Period Label and Confidence Interval are just combinations of other columns’ data.

# Columns that carry no extra information once only state-level rows remain.
redundant_columns = [
    'Group',
    'Subgroup',
    'Time Period Label',
    'Confidence Interval',
    'Quartile Range',
    'Low CI',
    'High CI',
]

# Keep the "By State" rows and drop the redundant columns in one expression.
# The trailing .copy() makes state_data an independent frame, so the column
# assignments performed later never write onto a slice of df.
state_data = (
    df.loc[df['Group'] == 'By State']
    .drop(columns=redundant_columns)
    .copy()
)

Clean up the Phase column:

# Inspect the distinct Phase labels before cleaning them.
state_data['Phase'].unique()

# Two of the labels have a date range appended after the phase number; those
# dates already live in other columns, so keep only the text before the
# first space.
state_data['Phase'] = state_data['Phase'].str.split(' ').str[0]

Add a column for the region of the United States that each state belongs to, based on the US Census Bureau’s “Census Regions and Divisions of the United States”.

# Census regions, each listed with its member states (the District of
# Columbia is grouped with the South).
_states_by_region = {
    'Northeast': [
        'Connecticut', 'Maine', 'Massachusetts', 'New Hampshire',
        'Rhode Island', 'Vermont', 'New Jersey', 'New York', 'Pennsylvania',
    ],
    'Midwest': [
        'Illinois', 'Indiana', 'Michigan', 'Ohio', 'Wisconsin', 'Iowa',
        'Kansas', 'Minnesota', 'Missouri', 'Nebraska', 'North Dakota',
        'South Dakota',
    ],
    'South': [
        'Delaware', 'Florida', 'Georgia', 'Maryland', 'North Carolina',
        'South Carolina', 'Virginia', 'District of Columbia', 'West Virginia',
        'Alabama', 'Kentucky', 'Mississippi', 'Tennessee', 'Arkansas',
        'Louisiana', 'Oklahoma', 'Texas',
    ],
    'West': [
        'Arizona', 'Colorado', 'Idaho', 'Montana', 'Nevada', 'New Mexico',
        'Utah', 'Wyoming', 'Alaska', 'California', 'Hawaii', 'Oregon',
        'Washington',
    ],
}

# Invert the table into the state -> region lookup used for mapping.
state_to_region = {
    state: region
    for region, states in _states_by_region.items()
    for state in states
}

# Look up each row's region by state name; a state missing from
# state_to_region would get NaN here.
state_data['Region'] = state_data['State'].map(state_to_region)

Change Data Types as needed

# Allowed levels for the two categorical columns. Any value not listed
# becomes NaN when the column is converted.
indicator_levels = [
    'Symptoms of Depressive Disorder',
    'Symptoms of Anxiety Disorder',
    'Symptoms of Anxiety Disorder or Depressive Disorder',
]
phase_levels = ['1', '2', '3', '3.1', '3.2', '3.3', '3.4', '3.5', '3.6', '3.7', '3.8', '3.9', '3.10']

state_data['Indicator'] = pd.Categorical(state_data['Indicator'], categories=indicator_levels)
state_data['Phase'] = pd.Categorical(state_data['Phase'], categories=phase_levels)

# Parse both period boundaries and keep only the calendar date.
for date_col in ('Time Period Start Date', 'Time Period End Date'):
    state_data[date_col] = pd.to_datetime(state_data[date_col]).dt.date

In a new DataFrame, pivot so that each Indicator and its respective values become separate columns, which makes some of the data analysis easier:

# Columns that identify one state in one survey period.
wide_index = [
    'State',
    'Phase',
    'Time Period',
    'Time Period Start Date',
    'Time Period End Date',
    'Region',
]

# Friendlier names for the three indicator columns produced by the pivot.
indicator_labels = {
    "Symptoms of Depressive Disorder": "Perc of Pop with Symptoms of Depression",
    "Symptoms of Anxiety Disorder": "Perc of Pop with Symptoms of Anxiety",
    "Symptoms of Anxiety Disorder or Depressive Disorder": "Perc of Pop with Symptoms of Either",
}

# One row per state and period, one column per indicator.
state_data_wide = (
    state_data
    .pivot(index=wide_index, columns='Indicator', values='Value')
    .reset_index()
    .rename(columns=indicator_labels)
)

# Rename the Value column in the long DataFrame as well, for consistency.
state_data.rename(columns={"Value": "Percent of Population"}, inplace=True)

EDA

Pairplot

First, let’s do a pairplot of the wide dataset (to get an idea of how depression and anxiety rates are correlated with each other and time)

# Pair plot of the wide table, coloured by region; points are drawn
# semi-transparent so overlapping observations stay visible.
pair_plot = sns.pairplot(data=state_data_wide, hue='Region', plot_kws=dict(alpha=0.4))
plt.show()

Observations: The percent of the population with symptoms of anxiety, depression, and either all appear to be highly linearly correlated with each other. Additionally, they all change over time similarly, all starting fairly high and then getting lower over time. This appears consistent among all regions; however, the Midwest tends to have notably lower rates of symptoms for anxiety, depression, and both.

Histogram

Now we can do a histogram and look at measures of center

# Overlaid histograms of the state-level percentages, one per indicator.
sns.histplot(data=state_data, x='Percent of Population', hue='Indicator', alpha=0.5)
plt.title('Histogram of Percent of Population by Anxiety/Depression')
plt.show()

# Mean and median percentage for each indicator.
measures = (
    state_data
    .groupby('Indicator', observed=False)['Percent of Population']
    .agg(['mean', 'median'])
)

# Distinct indicators, in order of first appearance in the data.
indicators = state_data['Indicator'].unique()

for indicator in indicators:
    center = measures.loc[indicator]
    print(f"For {indicator}:\nMean = {center['mean']}\nMedian = {center['median']}\n")

For Symptoms of Depressive Disorder:
Mean = 23.439215686274512
Median = 23.05

For Symptoms of Anxiety Disorder:
Mean = 29.430518659076533
Median = 29.1

For Symptoms of Anxiety Disorder or Depressive Disorder:
Mean = 33.98877292852625
Median = 33.6

Observations: The percent of the population with symptoms of anxiety, depression, and either all appear to be approximately normally distributed. As expected, the percent of the population with symptoms of either is the highest on average. However, the median and mean percent of the population with symptoms of anxiety are higher than those with symptoms of depression.

Bar Graph

We can use a bar graph to see if the regions have significantly different rates of symptoms of anxiety and depression

# Grouped bars: one group per region, one bar per indicator. Bar height is
# seaborn's default aggregate of the rows in each group (the mean).
sns.barplot(data=state_data, x='Region', y='Percent of Population', hue='Indicator')
plt.show()

Observations: Rates of symptoms of depression, anxiety, or either appear to be highest in the South, and are closely followed by the West. The lowest rates of symptoms of anxiety and of either anxiety or depression are in the Midwest; however, the Northeast and Midwest are fairly close as the lowest rates of symptoms of depression.

Line Graph

We can use a line graph to look at how these change over time

# Unweighted mean across states for every (period start date, indicator)
# pair. The grouping keys end up as the index levels of national_avgs.
national_avgs = (
    state_data
    .groupby(['Time Period Start Date', 'Indicator'], observed=False)
    .agg(nat_means=('Percent of Population', 'mean'))
)

# One line per indicator; x and hue refer to the index levels by name.
nat_avg_plt = sns.lineplot(
    data=national_avgs,
    x='Time Period Start Date',
    y='nat_means',
    hue='Indicator',
)
plt.xticks(rotation=45)
plt.title("Percent of Population Over Time")
plt.ylabel('Percent of Population')
plt.show()

Observations: It looks like following the outbreak of COVID-19, rates of symptoms of anxiety and depression sharply rose, followed by a dip towards the end of 2020 and another sharp rise towards the beginning of 2021 (which notably is around the time of the 2020 federal election). After that rise, rates once again fell and reached a somewhat stable level around mid-2021. They stayed at this level until around the end of 2022, where there was a slight increase and then a decrease back to that stable level. Finally, the rates appeared to dip towards the end of 2023 and then rise back up to the stable level. These trends appear consistent among depression symptoms, anxiety symptoms, and both. Overall, this line graph helps us visualize that the rates are not completely linear, but appear to fluctuate over time.

Map

We can graph the data over a map of the US to see regional trends over time

# Full name -> two-letter postal code, as required by plotly's
# 'USA-states' location mode.
state_code_map = {
    # States, A-I
    "Alabama": "AL", "Alaska": "AK", "Arizona": "AZ", "Arkansas": "AR",
    "California": "CA", "Colorado": "CO", "Connecticut": "CT",
    "Delaware": "DE", "Florida": "FL", "Georgia": "GA", "Hawaii": "HI",
    "Idaho": "ID", "Illinois": "IL", "Indiana": "IN", "Iowa": "IA",
    # States, K-N
    "Kansas": "KS", "Kentucky": "KY", "Louisiana": "LA", "Maine": "ME",
    "Maryland": "MD", "Massachusetts": "MA", "Michigan": "MI",
    "Minnesota": "MN", "Mississippi": "MS", "Missouri": "MO",
    "Montana": "MT", "Nebraska": "NE", "Nevada": "NV",
    "New Hampshire": "NH", "New Jersey": "NJ", "New Mexico": "NM",
    "New York": "NY", "North Carolina": "NC", "North Dakota": "ND",
    # States, O-W
    "Ohio": "OH", "Oklahoma": "OK", "Oregon": "OR", "Pennsylvania": "PA",
    "Rhode Island": "RI", "South Carolina": "SC", "South Dakota": "SD",
    "Tennessee": "TN", "Texas": "TX", "Utah": "UT", "Vermont": "VT",
    "Virginia": "VA", "Washington": "WA", "West Virginia": "WV",
    "Wisconsin": "WI", "Wyoming": "WY",
    # Federal district and territories
    "District of Columbia": "DC", "American Samoa": "AS", "Guam": "GU",
    "Northern Mariana Islands": "MP", "Puerto Rico": "PR",
    "United States Minor Outlying Islands": "UM",
    "Virgin Islands, U.S.": "VI",
}

# One colour scale per indicator, paired with `indicators` by position.
color_scales = ['blues', 'amp', 'purp']

# Draw one animated choropleth per indicator.
for indicator, scale in zip(indicators, color_scales):
    # .assign() returns a new frame, so the postal-code column is added to
    # an independent copy and not written onto a slice of state_data.
    fig_data = state_data[state_data['Indicator'] == indicator].assign(
        State_Code=lambda frame: frame['State'].map(state_code_map)
    )

    # Pin the colour range to this indicator's overall extremes so colours
    # are comparable across animation frames. The names deliberately avoid
    # rebinding the builtins max() and min() at module level.
    lowest = fig_data['Percent of Population'].min()
    highest = fig_data['Percent of Population'].max()

    fig = px.choropleth(
        fig_data,
        locations='State_Code',
        locationmode='USA-states',
        color='Percent of Population',
        scope='usa',
        title=f'Map of {indicator} in US states',
        hover_name='State',
        color_continuous_scale=scale,
        animation_frame='Time Period Start Date',
        range_color=[lowest, highest],
    )
    fig.show()

Observations: In these 3 graphs we see similar trends to what was observed in the line graph; however, we are better able to visualize the changes among US states/regions here. Notably, while most of the US follows similar trends over time of increased and then subsequently decreased rates of depression and anxiety, the Midwest appears to be less impacted by these trends. Also notable is that states that border each other definitely seem to get darker and lighter at the same time, meaning that rates appear to increase and decrease regionally, even if states in other regions aren’t changing much in a specific time period.

Linear Modeling

def give_me_one_period(df, time_period):
    """Return the rows of ``df`` whose 'Time Period' equals ``time_period``.

    The original index labels are kept; an empty frame is returned when no
    row matches.
    """
    in_period = df['Time Period'] == time_period
    return df.loc[in_period]
def model_eval(df, x_col, y_col):
    """Fit a one-predictor linear regression and print its summary numbers.

    Parameters
    ----------
    df : DataFrame containing both columns.
    x_col : name of the predictor column.
    y_col : name of the response column.

    Prints the intercept and coefficient (rounded to 3 decimals) and the
    R-squared, which is scored on the same rows used for fitting. Nothing
    is returned.
    """
    # scikit-learn expects a 2-D feature matrix, hence the reshape.
    predictor = df[x_col].values.reshape(-1, 1)
    response = df[y_col]

    fitted = LinearRegression()
    fitted.fit(predictor, response)

    print('model intercept :', fitted.intercept_.round(3))
    print('model coefficients : ', fitted.coef_.round(3))
    print('R-Squared: ', fitted.score(predictor, response))
def give_me_vif(df, x1, x2):
    """Print the variance inflation factor of ``x2`` given ``x1``.

    Parameters
    ----------
    df : DataFrame containing both columns.
    x1 : name of the other explanatory column.
    x2 : name of the column whose VIF is reported.

    ``variance_inflation_factor`` regresses the chosen column on the
    remaining columns of the matrix it is given and does not add an
    intercept itself. A constant column is therefore included here;
    without it the auxiliary regression is forced through the origin and
    the reported VIF is inflated for variables whose mean is far from zero.
    """
    design = np.column_stack((
        np.ones(len(df)),                # intercept term
        df[x1].to_numpy(dtype=float),
        df[x2].to_numpy(dtype=float),
    ))
    # Column 2 of the design matrix is x2, the same variable the VIF has
    # always been reported for.
    print(f"VIF: {variance_inflation_factor(design, exog_idx=2)}")
def plot_my_simple_model(df, x_col, y_col, model):
    """Show a regression scatter plot followed by a prediction-error plot.

    Parameters
    ----------
    df : DataFrame containing both columns.
    x_col : name of the predictor column.
    y_col : name of the response column.
    model : fitted estimator whose ``predict`` accepts an (n, 1) array of
        ``x_col`` values.

    The first figure is seaborn's ``lmplot`` of ``y_col`` against
    ``x_col``; ``model`` is used only for the second figure, which compares
    its predictions with the observed ``y_col`` values.
    """
    sns.lmplot(df, x=x_col, y=y_col)
    plt.show()

    features = df[x_col].values.reshape(-1, 1)
    predicted = model.predict(features)
    error_display = PredictionErrorDisplay(y_true=df[y_col], y_pred=predicted)
    error_display.plot()
    plt.show()

Modeling the percentage of the population with symptoms of anxiety as a function of the percentage of the population with symptoms of depression from time period one

# Restrict the wide table to the first survey period.
period_one = give_me_one_period(state_data_wide, 1)

# Response: anxiety percentage. Predictor: depression percentage, reshaped
# into the (n, 1) matrix scikit-learn expects.
anxiety_data = period_one['Perc of Pop with Symptoms of Anxiety']
depression_reshaped = (
    period_one['Perc of Pop with Symptoms of Depression']
    .to_numpy()
    .reshape(-1, 1)
)

anxiety_v_depression_model = LinearRegression()
anxiety_v_depression_model.fit(depression_reshaped, anxiety_data)

# Print intercept, slope and R-squared for the same regression.
model_eval(
    period_one,
    'Perc of Pop with Symptoms of Depression',
    'Perc of Pop with Symptoms of Anxiety',
)
model intercept : 8.364
model coefficients :  [0.955]
R-Squared:  0.6077103706269704

Plotting the anxiety-depression relationship

# Scatter plot with fit line, then the prediction-error plot, for the
# period-one anxiety-versus-depression model.
plot_my_simple_model(
    df=period_one,
    x_col='Perc of Pop with Symptoms of Depression',
    y_col='Perc of Pop with Symptoms of Anxiety',
    model=anxiety_v_depression_model,
)

# Regression across all periods relating depression percentage and survey
# time period.
# NOTE(review): as written, depression is the predictor and 'Time Period' is
# the response. The name depression_v_time_model, and the response-first
# naming of anxiety_v_depression_model, suggest the opposite orientation
# (depression as a function of time) may have been intended — confirm before
# interpreting the intercept and coefficient.
depression_reshaped = state_data_wide['Perc of Pop with Symptoms of Depression'].values.reshape(-1,1)
time_data = state_data_wide['Time Period']

depression_v_time_model = LinearRegression().fit(depression_reshaped, time_data)

# Print intercept, slope and R-squared for the same regression.
model_eval(state_data_wide, 'Perc of Pop with Symptoms of Depression', 'Time Period')
model intercept : 63.748
model coefficients :  [-1.376]
R-Squared:  0.10643280425364254
# Scatter plot with fit line, then the prediction-error plot, for the
# all-periods model relating depression percentage and time period.
plot_my_simple_model(
    df=state_data_wide,
    x_col='Perc of Pop with Symptoms of Depression',
    y_col='Time Period',
    model=depression_v_time_model,
)